###############
#Violence data#
###############

# Remember the current working directory; the clean-up step at the end of
# the script switches back to it.
oldwd <- getwd()

# Every relative path below is resolved against ./data
setwd('./data')

# Snapshot of the object names that exist before this script creates
# anything, used by the clean-up step to decide what to remove
old_ws <- ls()

################################
#Load and combine WB data files#
################################

# Names of the World Bank NVMS Stata files (*.dta) found in ./NVMS/
wb_files <- list.files('./NVMS/', pattern = "\\.dta$")

# Fail early: with no input files the steps below would only break later,
# with an error that does not point at the missing data
if (length(wb_files) == 0L) {
  stop("No .dta files found in ./NVMS/", call. = FALSE)
}

# Read each file (value labels are not converted to factors) and turn it
# into a data.table
wb_list <- lapply(
  paste0('./NVMS/', wb_files),
  function(path) as.data.table(read.dta(path, convert.factors = FALSE))
)

# Stack all files; fill = TRUE pads columns missing from some files with NA
wb_panel <- rbindlist(wb_list, fill = TRUE)
rm(wb_list); gc()


##################
#Data preparation#
##################

# Assign a unique event id (one per row). seq_len() stays empty on an empty
# table, whereas 1:.N would yield c(1, 0).
wb_panel[, unique_id := seq_len(.N)]

# Prepare date: tanggal_kejadian is parsed as day/month/year
wb_panel[, date := as.Date(tanggal_kejadian, format = "%d/%m/%Y")]
wb_panel[, year := year(date)]

# Inspect kecamatan codes
# missing_kecamatan: 1 when the kecamatan code is 0, otherwise 0
wb_panel[, missing_kecamatan := 1 * (kodebpskec1 %in% 0)]
# kab_from_kec: first four characters of the kecamatan code
wb_panel[, kab_from_kec := str_sub(kodebpskec1, 1L, 4L)]
# kab_kec_flag: TRUE when the recorded kabupaten code differs from the prefix
# of the kecamatan code (the comparison is done on the character form)
wb_panel[, kab_kec_flag := kodebpskab != kab_from_kec]


##################
#Select provinces#
##################

# Events per province and year. Computed once and reused below (the original
# recomputed this aggregation for every consumer).
province_year_counts <- wb_panel[, list(count = .N),
                                 by = list(provinsi, kodebpsprop, year)]

# Inspect province patterns: one row per province, one column per year
dcast.data.table(province_year_counts, provinsi + kodebpsprop ~ year,
                 value.var = 'count')

# A province is used when its earliest year with recorded events is 2005 or
# earlier
province_flags <- province_year_counts[, list(use = min(year) <= 2005),
                                       by = list(provinsi, kodebpsprop)]

# Provinces to use / not to use
provinces_use <- province_flags[use %in% TRUE]
provinces_no_use <- province_flags[use %in% FALSE]

wb_panel_use <- wb_panel[kodebpsprop %in% provinces_use$kodebpsprop]

# Drop events where state is an actor in violence
# NOTE(review): only actor_s1_tp is checked here; actor_s2_tp is not --
# confirm that state actors on side 2 are meant to be kept.
state_actors <- c(5, 6, 14:16, 19)
wb_panel_use <- wb_panel_use[!(actor_s1_tp %in% state_actors)]


########################
#Define Election cycles#
########################
# Inclusive date windows for each election cycle. Dates between the windows
# are left without a cycle (election_cycle stays NA).
# NOTE(review): end_2004 is 2009-04-05, while the figure further down marks
# 2009-04-09 as an election date -- confirm which day is intended.
start_2004 <- as.Date('2004-11-01')
end_2004 <- as.Date('2009-04-05')
start_2009 <- as.Date('2009-10-01')
end_2009 <- as.Date('2014-04-09')
start_placebo <- as.Date('1998-01-01')
end_placebo <- as.Date('2004-04-05')

# Tag every event with the cycle its date falls in. A range comparison
# selects the same rows as matching against a day-by-day sequence (dates are
# whole days) without building and searching that sequence; rows with a
# missing date compare as NA, which data.table treats as "not selected".
wb_panel_use[date >= start_2004 & date <= end_2004,
             election_cycle := "dprd_2004"]
wb_panel_use[date >= start_2009 & date <= end_2009,
             election_cycle := "dprd_2009"]
wb_panel_use[date >= start_placebo & date <= end_placebo,
             election_cycle := "dprd_2004_placebo"]


##########################################
#Map violence to election-year kecamatans#
##########################################

# Crosswalks from NVMS kecamatan codes to the kecamatan codes of each
# election year; every column is coerced to integer.
crosswalk_2004 <- fread('./crosswalks/kecamatan_nvms_to_dprd2004.csv')[, lapply(.SD, as.integer)]
setnames(crosswalk_2004, c('target_kecamatan', 'from_kecamatan'), c('kecamatan_2004', 'kodebpskec1'))
crosswalk_2009 <- fread('./crosswalks/kecamatan_nvms_to_dprd2009.csv')[, lapply(.SD, as.integer)]
setnames(crosswalk_2009, c('target_kecamatan', 'from_kecamatan'), c('kecamatan_2009', 'kodebpskec1'))

setkey(wb_panel_use, kodebpskec1)
setkey(crosswalk_2004, kodebpskec1)
setkey(crosswalk_2009, kodebpskec1)

# Attach both election-year codes to every event. Every event row is kept;
# codes without a crosswalk entry get NA. The join column is named
# explicitly so the second join does not depend on which key the result of
# the first join happens to carry.
# NOTE(review): if a crosswalk lists several targets for one kodebpskec1,
# the matching events are repeated once per target -- confirm the
# crosswalks are one-to-one (or that the duplication is intended).
wb_panel_use <- crosswalk_2004[wb_panel_use, on = 'kodebpskec1']
wb_panel_use <- crosswalk_2009[wb_panel_use, on = 'kodebpskec1']

######################
#Create violence data#
######################

#Main Violence Measures
#######################
# Each measure is a 0/1 flag per event. %in% never returns NA, so a missing
# code simply counts as "no match".

#NVM_Violence_1
#violence coded as inter-religious, intra-religious, or any religious actor
wb_panel_use[, NVM_Violence_1 := as.numeric(
  tp_kek1_new %in% 4404:4405 |
    actor_s1_tp %in% 11 |
    actor_s2_tp %in% 11 |
    actcountrelormas %in% 1
)]

#NVM_Violence_2
#NVM_Violence_1 + attacks on places of vice + attacks for sorcery
wb_panel_use[, NVM_Violence_2 := as.numeric(
  NVM_Violence_1 %in% 1 | tp_kek1_new %in% 5510:5511
)]

#NVM_Violence_3
#NVM_Violence_2 + attacks for sexual indiscretion
wb_panel_use[, NVM_Violence_3 := as.numeric(
  NVM_Violence_2 %in% 1 | tp_kek1_new %in% 5508
)]

#NVM_Violence_4
#NVM_Violence_2 + all other identity based violence (excluding gender, school, sports)
wb_panel_use[, NVM_Violence_4 := as.numeric(
  NVM_Violence_2 %in% 1 | tp_kek1_new %in% c(4402:4403, 4406:4408)
)]

#Combine building damage and destruction
# NOTE(review): the sum is NA whenever either component is NA -- confirm
# that is the intended treatment of missing damage counts.
wb_panel_use[, building_damage_destroy := build_dmg_total + bdg_des]


######################################################################
#Figure: NVMS Violence Type 1 / NVMS Violence Type 1 Deaths over time#
######################################################################


# Daily totals: type-1 incidents, deaths in type-1 incidents, and all events
wb_day_panel <- wb_panel_use[, list(
  NVM_Violence_1 = sum(NVM_Violence_1),
  NVM_Violence_1_deaths = sum(kil_total[NVM_Violence_1 == 1]),
  All = .N
), by = date]

# Weekly histogram (7-day bins) of type-1 incidents for 2001-2014; the red
# vertical lines are drawn at the three dates listed below
a <- ggplot(wb_day_panel[year(date) %in% 2001:2014],
            aes(date, weight = NVM_Violence_1)) +
  geom_histogram(binwidth = 7) +
  xlab(NULL) +
  theme_bw() +
  geom_vline(
    xintercept = as.numeric(as.Date(c('2004-04-05', '2009-04-09', '2014-04-09'))),
    color = 'red'
  ) +
  ggtitle("Incidents of Religious Violence")

# Write the figure as a landscape letter-size PDF
pdf('../output/figures/figure_e4.pdf', width = 11, height = 8.5)
plot(a)
dev.off()


##############################################
#Collapse to kecamatan within election cycles#
##############################################

# The four kecamatan-level collapses below share one column layout, so the
# aggregation is written once (the original repeated the same 20-36 column
# list for every table).

# Totals for one group of events.
#   flags   list of the four 0/1 vectors NVM_Violence_1..4
#   killed, injured, damage   per-event counts
#   day     event dates
#   first_days, last_days   optional date vectors; when supplied, the
#           *_f3 / *_l3 columns (events falling on those dates) are appended
# Returns a named list in the column order <k>_count, <k>_deaths,
# <k>_injured, <k>_casualties, <k>_damage[, <k>_count_f3, <k>_deaths_f3,
# <k>_count_l3, <k>_deaths_l3], each for k = NVM_Violence_1..4.
# NOTE(review): sum() is used without na.rm, so one missing count makes the
# whole group total NA -- confirm that is the intended treatment.
summarise_violence <- function(flags, killed, injured, damage, day,
                               first_days = NULL, last_days = NULL) {
  measure <- paste0('NVM_Violence_', seq_along(flags))
  # Attach the column names <measure><suffix> to a list of totals
  label <- function(values, suffix) {
    names(values) <- paste0(measure, suffix)
    values
  }
  # Sum `value` over the rows picked by each logical vector in `rows`
  sum_rows <- function(value, rows) lapply(rows, function(r) sum(value[r]))

  hits <- lapply(flags, function(f) f == 1)
  deaths <- sum_rows(killed, hits)
  wounded <- sum_rows(injured, hits)
  out <- c(
    label(lapply(flags, sum), '_count'),
    label(deaths, '_deaths'),
    label(wounded, '_injured'),
    label(Map(`+`, deaths, wounded), '_casualties'),
    label(sum_rows(damage, hits), '_damage')
  )
  if (is.null(first_days)) {
    return(out)
  }

  early <- day %in% first_days
  late <- day %in% last_days
  c(
    out,
    label(lapply(flags, function(f) sum(f[early])), '_count_f3'),
    label(sum_rows(killed, lapply(hits, function(h) h & early)), '_deaths_f3'),
    label(lapply(flags, function(f) sum(f[late])), '_count_l3'),
    label(sum_rows(killed, lapply(hits, function(h) h & late)), '_deaths_l3')
  )
}

# Collapse the events of one election cycle to one row per kecamatan.
#   cycle_label    value of election_cycle to keep
#   kecamatan_col  name of the grouping column (character)
collapse_to_kecamatan <- function(events, cycle_label, kecamatan_col,
                                  first_days = NULL, last_days = NULL) {
  events[election_cycle %in% cycle_label,
         summarise_violence(
           list(NVM_Violence_1, NVM_Violence_2, NVM_Violence_3, NVM_Violence_4),
           kil_total, inj_total, building_damage_destroy, date,
           first_days, last_days
         ),
         by = kecamatan_col]
}

#2009 election cycle
#Counts and intensive margins

# Windows from the cycle start to start + 3*30 days, and from end - 3*30
# days to the cycle end; they feed the *_f3 / *_l3 columns.
# NOTE(review): the objects are called f6_/l6_ although they span 3*30 days.
f6_2009 <- seq.Date(start_2009, start_2009 + 3*30, by = 1)
l6_2009 <- seq.Date(end_2009 - 3*30, end_2009, by = 1)

wb_2009 <- collapse_to_kecamatan(wb_panel_use, "dprd_2009", 'kecamatan_2009',
                                 f6_2009, l6_2009)

#2004 election cycle
#Counts and intensive margins

f6_2004 <- seq.Date(start_2004, start_2004 + 3*30, by = 1)
l6_2004 <- seq.Date(end_2004 - 3*30, end_2004, by = 1)

wb_2004 <- collapse_to_kecamatan(wb_panel_use, "dprd_2004", 'kecamatan_2004',
                                 f6_2004, l6_2004)


#Collapse by kecamatan and date
wb_2004_daily <- wb_panel_use[election_cycle %in% "dprd_2004",
                              list(NVM_Violence_1 = sum(NVM_Violence_1),
                                   NVM_Violence_1_deaths = sum(kil_total[NVM_Violence_1 == 1]),
                                   All = .N),
                              by = list(kecamatan_2004, date)]

wb_2009_daily <- wb_panel_use[election_cycle %in% "dprd_2009",
                              list(NVM_Violence_1 = sum(NVM_Violence_1),
                                   NVM_Violence_1_deaths = sum(kil_total[NVM_Violence_1 == 1]),
                                   All = .N),
                              by = list(kecamatan_2009, date)]

######################################################
#PLACEBO Collapse to kecamatan within election cycles#
######################################################

#2009 election cycle PLACEBO
#Counts and intensive margins
# NOTE(review): this uses the "dprd_2004" window grouped by kecamatan_2009,
# presumably so the preceding cycle serves as the placebo period for 2009 --
# confirm it is not a copy-paste slip.
wb_2009_placebo <- collapse_to_kecamatan(wb_panel_use, "dprd_2004", 'kecamatan_2009')

#2004 election cycle PLACEBO
#Counts and intensive margins
wb_2004_placebo <- collapse_to_kecamatan(wb_panel_use, "dprd_2004_placebo", 'kecamatan_2004')



###############################
#Collapse to DPRD constituency#
###############################

#Load kecamatan to DPRD crosswalks
##################################

#2004
# Rebuild the kecamatan id from its PROP / KABU / KECA parts, label each
# dapil as "<KAB_NAME> <DAPIL_NUMBER>", and keep only rows that have a dapil
# number and belong to a province in provinces_use.
kec_to_dprd_2004 <- fread("./crosswalks/kecamatan_to_dprd2_2004.csv")
setnames(kec_to_dprd_2004, 'DAPIL.NUMBER', "DAPIL_NUMBER")
kec_to_dprd_2004[KECA %in% 1, KECA := 10]
kec_to_dprd_2004[, `:=`(
  id_kec = paste0(PROP, sprintf("%02.f", as.numeric(KABU)), sprintf("%03.f", as.numeric(KECA))),
  dapil = paste(KAB_NAME, DAPIL_NUMBER)
)]
kec_to_dprd_2004 <- kec_to_dprd_2004[
  !is.na(DAPIL_NUMBER) & PROP %in% provinces_use$kodebpsprop,
  list(provinsi = PROV_NAME,
       id_prov = PROP,
       kabupaten = KAB_NAME,
       id_kab = KAB_CODE,
       id_kec = as.numeric(id_kec),
       dapil)
]
kec_to_dprd_2004[provinsi %in% "IRIAN JAYA BARAT", id_prov := 91]


#2009
# NOTE(review): rows are filtered on id_prov but the output id_prov is taken
# from provno -- confirm the two columns hold the same codes.
kec_to_dprd_2009 <- fread("./crosswalks/kecamatan_to_dprd2_2009.csv")
kec_to_dprd_2009 <- kec_to_dprd_2009[
  id_prov %in% provinces_use$kodebpsprop,
  list(provinsi = provinsi,
       id_prov = provno,
       kabupaten = kabkot,
       id_kab = id_kab,
       id_kec = as.numeric(id_kec),
       dapil = label)
]

#Merge in dapil codes
#####################
# Key both sides on the kecamatan id, then join. The result has every
# crosswalk row; kecamatans without events get NA violence columns.
setkey(kec_to_dprd_2004, id_kec)
setkey(kec_to_dprd_2009, id_kec)

setkey(wb_2004, kecamatan_2004)
setkey(wb_2009, kecamatan_2009)
wb_2004_merged <- wb_2004[kec_to_dprd_2004]
wb_2009_merged <- wb_2009[kec_to_dprd_2009]

setkey(wb_2004_daily, kecamatan_2004)
setkey(wb_2009_daily, kecamatan_2009)
wb_2004_daily_merged <- wb_2004_daily[kec_to_dprd_2004]
wb_2009_daily_merged <- wb_2009_daily[kec_to_dprd_2009]

setkey(wb_2004_placebo, kecamatan_2004)
setkey(wb_2009_placebo, kecamatan_2009)
wb_2004_placebo_merged <- wb_2004_placebo[kec_to_dprd_2004]
wb_2009_placebo_merged <- wb_2009_placebo[kec_to_dprd_2009]

# Sum of the non-missing values of x, returned as an integer.
# Returns 0L when x is empty or contains only missing values; a fractional
# total is truncated by as.integer().
sum0 <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    return(0L)
  }
  as.integer(sum(observed))
}

#Collapse
# Sum the given violence columns with sum0() within each group defined by
# `group_cols`, then stamp the result with its election cycle.
collapse_to_dapil <- function(merged, measure_cols, group_cols, cycle_year) {
  collapsed <- merged[, lapply(.SD, sum0), by = group_cols, .SDcols = measure_cols]
  collapsed[, election_cycle := cycle_year]
  collapsed
}

# Columns identifying a dapil; the daily panel additionally groups by date
dapil_id <- c('provinsi', 'id_prov', 'kabupaten', 'id_kab', 'dapil')
dapil_day_id <- c(dapil_id, 'date')

# Cycle totals (the column list is taken from the 2004 table for both years)
violence_vars <- grep('^NVM_Violence', names(wb_2004_merged), value = TRUE)
nvms_dapil_2004 <- collapse_to_dapil(wb_2004_merged, violence_vars, dapil_id, 2004)
nvms_dapil_2009 <- collapse_to_dapil(wb_2009_merged, violence_vars, dapil_id, 2009)

#daily panel
violence_vars <- grep('^NVM_Violence', names(wb_2004_daily_merged), value = TRUE)
nvms_dapil_2004_daily <- collapse_to_dapil(wb_2004_daily_merged, violence_vars, dapil_day_id, 2004)
nvms_dapil_2009_daily <- collapse_to_dapil(wb_2009_daily_merged, violence_vars, dapil_day_id, 2009)

#Placebos
violence_vars <- grep('^NVM_Violence', names(wb_2004_placebo_merged), value = TRUE)
nvms_dapil_2004_placebo <- collapse_to_dapil(wb_2004_placebo_merged, violence_vars, dapil_id, 2004)
nvms_dapil_2009_placebo <- collapse_to_dapil(wb_2009_placebo_merged, violence_vars, dapil_id, 2009)


##
#Distributions of counts
##
#hist(nvms_dapil_2004$NVM_Violence_3_count)
#sum(nvms_dapil_2009$NVM_Violence_1_count)

#########################
#Make kecamatan clusters#
#########################


#Load kecamatan to DPRD crosswalks
##################################
# The crosswalks are read again here without the provinces_use filter, so
# the clusters cover every province in the files.

#2004
kec_to_dprd_2004 <- fread("./crosswalks/kecamatan_to_dprd2_2004.csv")
setnames(kec_to_dprd_2004, 'DAPIL.NUMBER', "DAPIL_NUMBER")
kec_to_dprd_2004[KECA %in% 1, KECA := 10]
kec_to_dprd_2004[, id_kec := paste0(PROP, sprintf("%02.f", as.numeric(KABU)), sprintf("%03.f", as.numeric(KECA)))]
kec_to_dprd_2004[, dapil := paste(KAB_NAME, DAPIL_NUMBER)]
kec_to_dprd_2004 <- kec_to_dprd_2004[!is.na(DAPIL_NUMBER)]
kec_to_dprd_2004 <- kec_to_dprd_2004[,
                                     list(provinsi = PROV_NAME,
                                          id_prov = PROP,
                                          kabupaten = KAB_NAME,
                                          id_kab = KAB_CODE,
                                          id_kec = as.numeric(id_kec), dapil)]
kec_to_dprd_2004[provinsi %in% "IRIAN JAYA BARAT", id_prov := 91]


#2009
kec_to_dprd_2009 <- fread("./crosswalks/kecamatan_to_dprd2_2009.csv")
kec_to_dprd_2009 <- kec_to_dprd_2009[,
                                     list(provinsi = provinsi, id_prov = provno,
                                          kabupaten = kabkot, id_kab = id_kab,
                                          id_kec = as.numeric(id_kec), dapil = label)]


# Crosswalk from 2009 kecamatan codes to 2004 kecamatan codes
crosswalk_2009_2004 <- fread('./crosswalks/kecamatan_dprd2009_to_dprd2004.csv')[, lapply(.SD, as.integer)]
setnames(crosswalk_2009_2004, c('target_kecamatan', 'from_kecamatan'), c('kecamatan_2004', 'kecamatan_2009'))

setkey(crosswalk_2009_2004, kecamatan_2009)
setkey(kec_to_dprd_2009, id_kec)

# 2009 dapils expressed in 2004 kecamatan codes, and the 2004 dapils
dprd_2009_2004 <- crosswalk_2009_2004[kec_to_dprd_2009][
  , list(kecamatan_2004, id_kab_2009 = id_kab, dapil_2009 = dapil, id_prov_2009 = id_prov)
][!is.na(kecamatan_2004)]
dprd_2004 <- kec_to_dprd_2004[
  , list(kecamatan_2004 = id_kec, id_kab_2004 = id_kab, dapil_2004 = dapil, id_prov_2004 = id_prov)
][!is.na(kecamatan_2004)]


setkey(dprd_2004, kecamatan_2004)
setkey(dprd_2009_2004, kecamatan_2004)

# Pair every 2004 dapil with the 2009 dapils that share a 2004 kecamatan;
# links = number of shared kecamatans. Each dapil gets a tag of the form
# "<year>:<id_kab>:<dapil>".
dprd_paired <- dprd_2009_2004[dprd_2004]
dprd_paired <- dprd_paired[, list(links = length(unique(kecamatan_2004))), by = list(id_kab_2004, dapil_2004, id_kab_2009, dapil_2009)]
dprd_paired[!is.na(dapil_2004), from := paste("2004", id_kab_2004, dapil_2004, sep = ":")]
dprd_paired[!is.na(dapil_2009), to := paste("2009", id_kab_2009, dapil_2009, sep = ":")]

# Vertices: every distinct dapil tag; edges: the 2004-2009 pairs
u_dprd <- dprd_paired[, list(to, from)] %>% unlist %>% unique %>% .[!is.na(.)]
vertices <- data.table(u_dprd)

dprd_graph <- dprd_paired[!is.na(to) & !is.na(from), list(to, from)] %>%
  graph_from_data_frame(directed = FALSE, vertices = vertices)

# A cluster is a connected component of the dapil graph. components() is the
# current name of the function formerly called clusters().
groups <- components(dprd_graph)

group_list <- split(names(groups$membership), groups$membership)

# One row per dapil tag with the (character) id of its cluster
dapil2cluster <- Map(function(tags, id) data.table(dapil_tag = tags, cluster = id),
                     group_list, names(group_list)) %>%
  rbindlist()

# year_flag: number of distinct years in the cluster; count: dapils in it
dapil2cluster[, year := str_extract(dapil_tag, '^\\d{4}')]
dapil2cluster[, year_flag := year %>% unique %>% length, by = cluster]
dapil2cluster[, count := dapil_tag %>% unique %>% length, by = cluster]

#dapil_check = dapil2cluster[count > 12 | year_flag == 1, ]
#fwrite(dapil_check, '../dapils_check_kecamatan.csv')

# Split the tag back into its id_kab and dapil parts
# NOTE(review): the dapil pattern needs four digits and a colon directly
# before the dapil name, i.e. it presumably assumes a four-digit id_kab and
# no ':' inside the dapil label -- confirm against the crosswalk files.
dapil2cluster[, id_kab := str_extract(dapil_tag, '(?<=\\d{4}:)\\d+') ]
dapil2cluster[, dapil := str_extract(dapil_tag, '(?<=\\d{4}:)[^:]+$') ]

dapil_cluster_table <- dapil2cluster[, list(dapil, id_kab = id_kab %>% as.numeric, election_cycle = as.numeric(year), cluster, cl_year_flag = year_flag, cl_count = count)]

write.csv(dapil_cluster_table, './crosswalks/dapil_clusters.csv', row.names = FALSE)

#Merge clusters to violence data
# All tables are keyed on (election_cycle, id_kab, dapil); each join keeps
# every row of the violence table and adds the cluster columns, which are NA
# for dapils that are not in dapil_cluster_table.
setkey(dapil_cluster_table, election_cycle, id_kab, dapil)
setkey(nvms_dapil_2004, election_cycle, id_kab, dapil)
setkey(nvms_dapil_2009, election_cycle, id_kab, dapil)
setkey(nvms_dapil_2004_daily, election_cycle, id_kab, dapil)
setkey(nvms_dapil_2009_daily, election_cycle, id_kab, dapil)
setkey(nvms_dapil_2004_placebo, election_cycle, id_kab, dapil)
setkey(nvms_dapil_2009_placebo, election_cycle, id_kab, dapil)

nvms_dapil_2004 <- dapil_cluster_table[nvms_dapil_2004]
nvms_dapil_2009 <- dapil_cluster_table[nvms_dapil_2009]

nvms_dapil_2004_daily <- dapil_cluster_table[nvms_dapil_2004_daily]
nvms_dapil_2009_daily <- dapil_cluster_table[nvms_dapil_2009_daily]

nvms_dapil_2004_placebo <- dapil_cluster_table[nvms_dapil_2004_placebo]
nvms_dapil_2009_placebo <- dapil_cluster_table[nvms_dapil_2009_placebo]

# Stack both cycles. Rows without a cluster get their own negative id
# ("-1", "-2", ...); seq_len() keeps this well-defined when no row is
# unmatched, where 1:.N would produce c(1, 0).
nvms_dapil <- rbindlist(list(nvms_dapil_2004, nvms_dapil_2009), use.names = TRUE)
nvms_dapil[is.na(cluster), cluster := as.character(-seq_len(.N))]

nvms_dapil_placebo <- rbindlist(list(nvms_dapil_2004_placebo, nvms_dapil_2009_placebo), use.names = TRUE)
nvms_dapil_placebo[is.na(cluster), cluster := as.character(-seq_len(.N))]

# NOTE(review): the daily panel has one row per dapil and date, so an
# unmatched dapil receives a different cluster id on every date -- confirm
# this is intended before using `cluster` to group the daily data.
nvms_dapil_daily <- rbindlist(list(nvms_dapil_2004_daily, nvms_dapil_2009_daily), use.names = TRUE)
nvms_dapil_daily[is.na(cluster), cluster := as.character(-seq_len(.N))]


#Make binary violence data#
###########################
# Every indicator is TRUE when the corresponding total is greater than zero.
# The tables below map the suffix of each source column to the suffix of the
# indicator derived from it; columns are created in the order listed.

#NVMS
nvms_binary_suffix <- c(
  '_count'      = '_binary',
  '_count_f3'   = '_binary_f3',
  '_count_l3'   = '_binary_l3',
  '_deaths'     = '_deaths_binary',
  '_injured'    = '_injured_binary',
  '_casualties' = '_casualties_binary',
  '_damage'     = '_damage_binary'
)
for (source_suffix in names(nvms_binary_suffix)) {
  source_cols <- paste0('NVM_Violence_', 1:4, source_suffix)
  binary_cols <- paste0('NVM_Violence_', 1:4, nvms_binary_suffix[[source_suffix]])
  nvms_dapil[, (binary_cols) := lapply(.SD, function(x) x > 0), .SDcols = source_cols]
}

# Intensive margin: any death, injury or building damage for the measure
for (measure_id in 1:4) {
  component_cols <- paste0('NVM_Violence_', measure_id, '_', c('deaths', 'injured', 'damage'), '_binary')
  intensive_col <- paste0('NVM_Violence_', measure_id, '_intensive_binary')
  nvms_dapil[, (intensive_col) := rowSums(.SD) > 0, .SDcols = component_cols]
}


#NVMS placebo
placebo_binary_suffix <- c(
  '_count'      = '_binary',
  '_deaths'     = '_deaths_binary',
  '_injured'    = '_injured_binary',
  '_casualties' = '_casualties_binary',
  '_damage'     = '_damage_binary'
)
for (source_suffix in names(placebo_binary_suffix)) {
  source_cols <- paste0('NVM_Violence_', 1:4, source_suffix)
  binary_cols <- paste0('NVM_Violence_', 1:4, placebo_binary_suffix[[source_suffix]])
  nvms_dapil_placebo[, (binary_cols) := lapply(.SD, function(x) x > 0), .SDcols = source_cols]
}


#Clean up
# Remove everything this script created except the three result tables.
# The list of names is passed straight to rm(): the original stored it in a
# variable called `drop`, which was itself never removed and stayed behind
# in the workspace under the name of base::drop.
# NOTE(review): the working directory is only restored when the script runs
# to the end; after an error the session stays in ./data.
rm(list = setdiff(ls(), c(old_ws, 'nvms_dapil', 'nvms_dapil_placebo', 'nvms_dapil_daily')))
setwd(oldwd)